Acquiring the data

In [1]:
# Running first cell for *weekly* "Search Trends" dataset collection, cleaning, and partial display

import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
url = 'symptoms_weekly.csv'
df1 = pd.read_csv(url,index_col=0,parse_dates=[0])
# Find the columns where every value is null
empty_cols = [col for col in df1.columns if df1[col].isnull().all()]
# Drop these columns from the dataframe
df1.drop(empty_cols,
        axis=1,
        inplace=True)

df1 = df1[df1['sub_region_1_code'].notna()]
df1['open_covid_region_code']=df1['sub_region_1_code']
df1['state_codes']=df1['open_covid_region_code'].str[3:]
#drop 'sub_region_1_code' column bc redundant
redundant = ['place_id']
df1=df1.drop(columns=redundant)
# Delete rows which contains less than 30 non NaN values
df1=df1.dropna(thresh=390)
df1['date']= pd.to_datetime(df1['date'])
#create multi-index constructed from date and open_covid_region_code
df1 = df1.set_index(['open_covid_region_code','date'])
print (df1.head())
print (df1.shape)
<ipython-input-1-b2370e85b0db>:7: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.
  pd.set_option('display.max_colwidth', -1)
                                  country_region sub_region_1  \
open_covid_region_code date                                     
US-AL                  2020-01-06  United States  Alabama       
                       2020-01-13  United States  Alabama       
                       2020-01-20  United States  Alabama       
                       2020-01-27  United States  Alabama       
                       2020-02-03  United States  Alabama       

                                  sub_region_1_code  \
open_covid_region_code date                           
US-AL                  2020-01-06  US-AL              
                       2020-01-13  US-AL              
                       2020-01-20  US-AL              
                       2020-01-27  US-AL              
                       2020-02-03  US-AL              

                                   symptom:Abdominal obesity  \
open_covid_region_code date                                    
US-AL                  2020-01-06  3.33                        
                       2020-01-13  3.43                        
                       2020-01-20  3.32                        
                       2020-01-27  3.17                        
                       2020-02-03  3.29                        

                                   symptom:Abdominal pain  ...  \
open_covid_region_code date                                ...   
US-AL                  2020-01-06  6.25                    ...   
                       2020-01-13  6.27                    ...   
                       2020-01-20  6.29                    ...   
                       2020-01-27  6.22                    ...   
                       2020-02-03  6.20                    ...   

                                   symptom:Xerostomia  symptom:Yawn  \
open_covid_region_code date                                           
US-AL                  2020-01-06  0.60                0.23           
                       2020-01-13  0.60                0.24           
                       2020-01-20  0.66                0.22           
                       2020-01-27  0.62                0.19           
                       2020-02-03  0.59                0.20           

                                   symptom:hyperhidrosis  \
open_covid_region_code date                                
US-AL                  2020-01-06  0.64                    
                       2020-01-13  0.73                    
                       2020-01-20  0.64                    
                       2020-01-27  0.68                    
                       2020-02-03  0.69                    

                                   symptom:pancreatitis  state_codes  
open_covid_region_code date                                           
US-AL                  2020-01-06  0.60                  AL           
                       2020-01-13  0.58                  AL           
                       2020-01-20  0.60                  AL           
                       2020-01-27  0.55                  AL           
                       2020-02-03  0.58                  AL           

[5 rows x 426 columns]
(2649, 426)
In [2]:
# Running second cell for "COVID Hospitalization Cases" dataset collection, cleaning, and partial display

import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
url = 'https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/cc_by/aggregated_cc_by.csv'
df2 = pd.read_csv(url,index_col=0,parse_dates=[0])
#replace 0 values with null
df2 = df2.replace({0.0:np.nan})
#get rid of irrelevant features
relevant_col = ['region_name', 'date', 'hospitalized_new']
df2=df2[relevant_col]
# Delete rows which have no value for hospitalized_new
df2=df2.dropna(subset = ['hospitalized_new'])
#convert date format and resample to weekly
df2['date'] = pd.to_datetime(df2['date'])
df2=df2.set_index('date').groupby([df2.index, 'region_name']).resample('W', label='left', loffset=pd.DateOffset(days=1)).sum().fillna(0).reset_index().set_index('open_covid_region_code')
#create multi-index constructed from open_covid_region_code and 'date'
df2 = df2.set_index([df2.index, 'date'])
print (df2.head(10))
print (df2.shape)
<ipython-input-2-7c6d8d764758>:7: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.
  pd.set_option('display.max_colwidth', -1)
/Users/patrickiskandar/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3071: DtypeWarning: Columns (15) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
                                   region_name  hospitalized_new
open_covid_region_code date                                     
CHE                    2020-02-24  Switzerland  49.0            
                       2020-03-02  Switzerland  106.0           
                       2020-03-09  Switzerland  339.0           
                       2020-03-16  Switzerland  819.0           
                       2020-03-23  Switzerland  1147.0          
                       2020-03-30  Switzerland  630.0           
                       2020-04-06  Switzerland  340.0           
                       2020-04-13  Switzerland  212.0           
                       2020-04-20  Switzerland  97.0            
                       2020-04-27  Switzerland  59.0            
(5951, 2)
In [3]:
# Running third cell for merging the weekly symptoms and hospitalization datasets. One dataframe object for visualization and one for machine learning algorithms.
# Both dataframes objects are shown partially.

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
inner_df = pd.merge(df1, df2, left_index=True, right_index=True)
#inner_df contains only points that have data for both symptoms and hospitalizations
left_df = pd.merge(df1, df2, left_index=True, right_index=True, how='left')
#left_df contains all rows in df1, even if hospitalized_new value is null
print (left_df.head(5))
print(left_df.shape)
print (inner_df.head(5))
print (inner_df.shape)
left_df.to_csv('out.csv')
                                  country_region sub_region_1  \
open_covid_region_code date                                     
US-AL                  2020-01-06  United States  Alabama       
                       2020-01-13  United States  Alabama       
                       2020-01-20  United States  Alabama       
                       2020-01-27  United States  Alabama       
                       2020-02-03  United States  Alabama       

                                  sub_region_1_code  \
open_covid_region_code date                           
US-AL                  2020-01-06  US-AL              
                       2020-01-13  US-AL              
                       2020-01-20  US-AL              
                       2020-01-27  US-AL              
                       2020-02-03  US-AL              

                                   symptom:Abdominal obesity  \
open_covid_region_code date                                    
US-AL                  2020-01-06  3.33                        
                       2020-01-13  3.43                        
                       2020-01-20  3.32                        
                       2020-01-27  3.17                        
                       2020-02-03  3.29                        

                                   symptom:Abdominal pain  ...  \
open_covid_region_code date                                ...   
US-AL                  2020-01-06  6.25                    ...   
                       2020-01-13  6.27                    ...   
                       2020-01-20  6.29                    ...   
                       2020-01-27  6.22                    ...   
                       2020-02-03  6.20                    ...   

                                   symptom:hyperhidrosis  \
open_covid_region_code date                                
US-AL                  2020-01-06  0.64                    
                       2020-01-13  0.73                    
                       2020-01-20  0.64                    
                       2020-01-27  0.68                    
                       2020-02-03  0.69                    

                                   symptom:pancreatitis  state_codes  \
open_covid_region_code date                                            
US-AL                  2020-01-06  0.60                  AL            
                       2020-01-13  0.58                  AL            
                       2020-01-20  0.60                  AL            
                       2020-01-27  0.55                  AL            
                       2020-02-03  0.58                  AL            

                                   region_name  hospitalized_new  
open_covid_region_code date                                       
US-AL                  2020-01-06  NaN         NaN                
                       2020-01-13  NaN         NaN                
                       2020-01-20  NaN         NaN                
                       2020-01-27  NaN         NaN                
                       2020-02-03  NaN         NaN                

[5 rows x 428 columns]
(2649, 428)
                                  country_region  sub_region_1  \
open_covid_region_code date                                      
US-WI                  2020-03-30  United States  Wisconsin      
US-ME                  2020-11-30  United States  Maine          
US-AR                  2020-11-23  United States  Arkansas       
US-HI                  2020-06-08  United States  Hawaii         
US-RI                  2020-09-07  United States  Rhode Island   

                                  sub_region_1_code  \
open_covid_region_code date                           
US-WI                  2020-03-30  US-WI              
US-ME                  2020-11-30  US-ME              
US-AR                  2020-11-23  US-AR              
US-HI                  2020-06-08  US-HI              
US-RI                  2020-09-07  US-RI              

                                   symptom:Abdominal obesity  \
open_covid_region_code date                                    
US-WI                  2020-03-30  2.96                        
US-ME                  2020-11-30  1.57                        
US-AR                  2020-11-23  2.17                        
US-HI                  2020-06-08  5.49                        
US-RI                  2020-09-07  2.52                        

                                   symptom:Abdominal pain  ...  \
open_covid_region_code date                                ...   
US-WI                  2020-03-30  4.45                    ...   
US-ME                  2020-11-30  3.83                    ...   
US-AR                  2020-11-23  6.14                    ...   
US-HI                  2020-06-08  6.23                    ...   
US-RI                  2020-09-07  5.42                    ...   

                                   symptom:hyperhidrosis  \
open_covid_region_code date                                
US-WI                  2020-03-30  0.52                    
US-ME                  2020-11-30  0.44                    
US-AR                  2020-11-23  0.52                    
US-HI                  2020-06-08  0.81                    
US-RI                  2020-09-07  0.63                    

                                   symptom:pancreatitis  state_codes  \
open_covid_region_code date                                            
US-WI                  2020-03-30  0.34                  WI            
US-ME                  2020-11-30  0.43                  ME            
US-AR                  2020-11-23  0.58                  AR            
US-HI                  2020-06-08  0.49                  HI            
US-RI                  2020-09-07  0.46                  RI            

                                    region_name  hospitalized_new  
open_covid_region_code date                                        
US-WI                  2020-03-30  Wisconsin     624.0             
US-ME                  2020-11-30  Maine         76.0              
US-AR                  2020-11-23  Arkansas      371.0             
US-HI                  2020-06-08  Hawaii        6.0               
US-RI                  2020-09-07  Rhode Island  57.0              

[5 rows x 428 columns]
(1356, 428)
<ipython-input-3-281656ca80b5>:7: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.
  pd.set_option('display.max_colwidth', -1)

Part 2: Visualization

Now, we visualize the data from the symptoms dataset after computing the mean column values to eliminate all but the most common symptoms, and then choosing the most relevant of those to COVID-19.

In [4]:
#visualization of weekly dataset
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
df = pd.read_csv('out.csv')
df=df.dropna(axis=1, thresh=100)
#keep only sympoms (columns) with highest mean search frequency
means = df.mean()
most_pop = [ind for m,ind in zip(means,means.index) if m>6]
state_codes = df['state_codes']
date = df['date']
open_covid_region_code = df['open_covid_region_code']
df = df[most_pop]
df['state_codes']= state_codes
df['open_covid_region_code'] = open_covid_region_code
df['date'] = date
df['date_by_week'] = date
df = df.set_index(['open_covid_region_code', 'date']) 
print(most_pop)
print(df.head(5))
fig = px.choropleth(df, locations=df['state_codes'],
                    color=df['symptom:Cough'],
                    hover_name=df['symptom:Cough'],
                    locationmode = 'USA-states',
                    animation_frame=df['date_by_week'],
                    title = "'Cough' Google Search Frequency in USA",
                    scope = "usa",
                   color_continuous_scale=px.colors.sequential.PuRd)
fig["layout"].pop("updatemenus")

fig2 = px.choropleth(df, locations=df['state_codes'],
                    color=df['symptom:Common cold'],
                    hover_name=df['symptom:Common cold'],
                    locationmode = 'USA-states',
                    animation_frame=df['date_by_week'],
                    title = "'Common Cold' Google Search Frequency in USA",
                    scope = "usa",
                   color_continuous_scale=px.colors.sequential.PuRd)
fig2["layout"].pop("updatemenus")

fig3 = px.choropleth(df, locations=df['state_codes'],
                    color=df['symptom:Fever'],
                    hover_name=df['symptom:Fever'],
                    locationmode = 'USA-states',
                    animation_frame=df['date_by_week'],
                    title = "'Fever' Google Search Frequency in USA",
                    scope = "usa",
                   color_continuous_scale=px.colors.sequential.PuRd)
fig3["layout"].pop("updatemenus")

fig4 = px.choropleth(df, locations=df['state_codes'],
                    color=df['symptom:Infection'],
                    hover_name=df['symptom:Infection'],
                    locationmode = 'USA-states',
                    animation_frame=df['date_by_week'],
                    title = "'Infection' Google Search Frequency in USA",
                    scope = "usa",
                   color_continuous_scale=px.colors.sequential.PuRd)
fig4["layout"].pop("updatemenus")

fig5 = px.choropleth(df, locations=df['state_codes'],
                    color=df['symptom:Pain'],
                    hover_name=df['symptom:Pain'],
                    locationmode = 'USA-states',
                    animation_frame=df['date_by_week'],
                    title = "'Pain' Google Search Frequency in USA",
                    scope = "usa",
                   color_continuous_scale=px.colors.sequential.PuRd)
fig5["layout"].pop("updatemenus")

fig6 = px.choropleth(df, locations=df['state_codes'],
                    color=df['symptom:Anxiety'],
                    hover_name=df['symptom:Anxiety'],
                    locationmode = 'USA-states',
                    animation_frame=df['date_by_week'],
                    title = "'Anxiety' Google Search Frequency in USA",
                    scope = "usa",
                   color_continuous_scale=px.colors.sequential.PuRd)
fig6["layout"].pop("updatemenus")




fig.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()
['symptom:Acne', 'symptom:Allergy', 'symptom:Anxiety', 'symptom:Arthritis', 'symptom:Back pain', 'symptom:Common cold', 'symptom:Cough', 'symptom:Diabetes', 'symptom:Fever', 'symptom:Hypertension', 'symptom:Infection', 'symptom:Inflammation', 'symptom:Itch', 'symptom:Obesity', 'symptom:Pain', 'symptom:Skin rash', 'symptom:Swelling', 'hospitalized_new']
                                   symptom:Acne  symptom:Allergy  \
open_covid_region_code date                                        
US-AL                  2020-01-06  11.05         13.21             
                       2020-01-13  10.82         13.23             
                       2020-01-20  11.50         14.25             
                       2020-01-27  10.56         14.19             
                       2020-02-03  10.60         14.14             

                                   symptom:Anxiety  symptom:Arthritis  \
open_covid_region_code date                                             
US-AL                  2020-01-06  9.91             7.02                
                       2020-01-13  10.09            7.31                
                       2020-01-20  10.10            7.22                
                       2020-01-27  10.45            7.03                
                       2020-02-03  10.25            6.85                

                                   symptom:Back pain  ...  symptom:Skin rash  \
open_covid_region_code date                           ...                      
US-AL                  2020-01-06  7.58               ...  11.73               
                       2020-01-13  7.69               ...  8.96                
                       2020-01-20  7.42               ...  9.47                
                       2020-01-27  7.34               ...  11.35               
                       2020-02-03  7.27               ...  8.57                

                                   symptom:Swelling  hospitalized_new  \
open_covid_region_code date                                             
US-AL                  2020-01-06  7.26             NaN                 
                       2020-01-13  7.31             NaN                 
                       2020-01-20  7.43             NaN                 
                       2020-01-27  7.42             NaN                 
                       2020-02-03  7.26             NaN                 

                                   state_codes  date_by_week  
open_covid_region_code date                                   
US-AL                  2020-01-06  AL           2020-01-06    
                       2020-01-13  AL           2020-01-13    
                       2020-01-20  AL           2020-01-20    
                       2020-01-27  AL           2020-01-27    
                       2020-02-03  AL           2020-02-03    

[5 rows x 20 columns]
In [ ]: